In [5]:
import nltk
from nltk.book import *
# http://www.nltk.org/book/ch01.html
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
In [12]:
# Echo the Text object to confirm which corpus `text6` is bound to.
text6
Out[12]:
<Text: Monty Python and the Holy Grail>
In [13]:
# Print every occurrence of the token "swallow" together with its surrounding context.
text6.concordance("swallow")
Displaying 10 of 10 matches:
is a temperate zone . ARTHUR : The swallow may fly south with the sun or the h
be carried . SOLDIER # 1 : What ? A swallow carrying a coconut ? ARTHUR : It co
o maintain air - speed velocity , a swallow needs to beat its wings forty - thr
: It could be carried by an African swallow ! SOLDIER # 1 : Oh , yeah , an Afri
OLDIER # 1 : Oh , yeah , an African swallow maybe , but not a European swallow
swallow maybe , but not a European swallow . That ' s my point . SOLDIER # 2 :
and Sir Bedevere , not more than a swallow ' s flight away , had discovered so
omething . Oh , that ' s an unladen swallow ' s flight , obviously . I mean , t
air - speed velocity of an unladen swallow ? ARTHUR : What do you mean ? An Af
o you mean ? An African or European swallow ? BRIDGEKEEPER : Huh ? I -- I don '
In [19]:
# List words that nltk reports as appearing in contexts similar to those of "Soldier".
# NOTE(review): the script itself spells the speaker label "SOLDIER" in capitals;
# presumably this lookup ignores case — confirm against the nltk Text.similar docs.
text6.similar("Soldier")
arthur yes do are if now makes
In [27]:
# Report the contexts that are shared by both "oh" and "very" in this text.
text6.common_contexts(["oh", "very"])
No common contexts were found
In [31]:
# Draw a lexical dispersion plot: where each listed word occurs along the length of the text.
text6.dispersion_plot(["swallow", "European", "it", "oh", "very"])
In [37]:
# Total number of tokens in the text (words and punctuation symbols are both tokens).
len(text6)
Out[37]:
16967
In [38]:
# Sorted vocabulary: the distinct tokens of the text. Punctuation and differently-cased
# spellings of a word each count as separate entries.
sorted(set(text6))
Out[38]:
['!',
'!)',
'!,',
'!]',
'#',
"'",
"'!",
"',",
"'.",
"'...",
"'?",
'(',
',',
",'",
',--',
'-',
'--',
'--...',
'.',
".'",
'.)',
'..',
'...',
'...?',
'...]',
'1',
'10',
'11',
'12',
'13',
'14',
'15',
'16',
'17',
'18',
'19',
'2',
'20',
'21',
'22',
'23',
'24',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
':',
';',
'?',
'?!',
'A',
'ALL',
'AMAZING',
'ANIMATOR',
'ARMY',
'ARTHUR',
'Aaaaaaaaah',
'Aaaaaaaah',
'Aaaaaah',
'Aaaah',
'Aaaaugh',
'Aaagh',
'Aaah',
'Aaauggh',
'Aaaugh',
'Aaauugh',
'Aagh',
'Aah',
'Aauuggghhh',
'Aauuugh',
'Aauuuuugh',
'Aauuuves',
'Action',
'Actually',
'African',
'Ages',
'Aggh',
'Agh',
'Ah',
'Ahh',
'Alice',
'All',
'Allo',
'Almighty',
'Alright',
'Am',
'Amen',
'An',
'Anarcho',
'And',
'Angnor',
'Anthrax',
'Antioch',
'Anybody',
'Anyway',
'Apples',
'Aramaic',
'Are',
'Arimathea',
'Armaments',
'Arthur',
'As',
'Ask',
'Assyria',
'At',
'Attila',
'Augh',
'Autumn',
'Auuuuuuuugh',
'Away',
'Ay',
'Ayy',
'B',
'BEDEVERE',
'BLACK',
'BORS',
'BRIDE',
'BRIDGEKEEPER',
'BROTHER',
'Back',
'Bad',
'Badon',
'Battle',
'Be',
'Beast',
'Bedevere',
'Bedwere',
'Behold',
'Between',
'Beyond',
'Black',
'Bloody',
'Blue',
'Bon',
'Bones',
'Book',
'Bors',
'Brave',
'Bravely',
'Bravest',
'Bread',
'Bridge',
'Bring',
'Bristol',
'Britain',
'Britons',
'Brother',
'Build',
'Burn',
'But',
'By',
'C',
'CAMERAMAN',
'CART',
'CARTOON',
'CHARACTER',
'CHARACTERS',
'CONCORDE',
'CRAPPER',
'CRASH',
'CRONE',
'CROWD',
'CUSTOMER',
'Caerbannog',
'Camaaaaaargue',
'Camelot',
'Castle',
'Chapter',
'Charge',
'Chaste',
'Cherries',
'Chicken',
'Chickennn',
'Chop',
'Christ',
'Churches',
'Cider',
'Clark',
'Clear',
'Come',
'Concorde',
'Consult',
'Cornwall',
'Could',
'Course',
'Court',
'Crapper',
'Cut',
'DEAD',
'DENNIS',
'DINGO',
'DIRECTOR',
'Dappy',
'Death',
'Defeat',
'Dennis',
'Did',
'Didn',
'Dingo',
'Dis',
'Divine',
'Do',
'Doctor',
'Does',
'Don',
'Dragon',
'Dramatically',
'ENCHANTER',
'Ecky',
'Ector',
'Eee',
'Eh',
'Enchanter',
'England',
'English',
'Erbert',
'Ere',
'Erm',
'Eternal',
'European',
'Even',
'Every',
'Everything',
'Ewing',
'Exactly',
'Excalibur',
'Excuse',
'Explain',
'FATHER',
'FRENCH',
'Far',
'Farewell',
'Father',
'Fetchez',
'Fiends',
'Fine',
'First',
'Firstly',
'Five',
'Follow',
'For',
'Forgive',
'Forward',
'Found',
'Four',
'France',
'Frank',
'French',
'GALAHAD',
'GIRLS',
'GOD',
'GREEN',
'GUARD',
'GUARDS',
'GUEST',
'GUESTS',
'Gable',
'Galahad',
'Gallahad',
'Gawain',
'Get',
'Go',
'God',
'Good',
'Gorge',
'Grail',
'Great',
'Greetings',
'Grenade',
'Guards',
'Guy',
'HEAD',
'HEADS',
'HERBERT',
'HISTORIAN',
'Ha',
'Hah',
'Hallo',
'Halt',
'Hand',
'Hang',
'Have',
'Haw',
'He',
'Hee',
'Heee',
'Heh',
'Hello',
'Help',
'Herbert',
'Here',
'Hey',
'Hic',
'Hill',
'Himself',
'His',
'Hiyaah',
'Hiyah',
'Hiyya',
'Hm',
'Hmm',
'Ho',
'Hoa',
'Hold',
'Holy',
'Honestly',
'Hoo',
'Hooray',
'How',
'Huh',
'Hurry',
'Huy',
'Huyah',
'Hya',
'Hyy',
'I',
'INSPECTOR',
'Idiom',
'Iesu',
'If',
'Iiiiives',
'Iiiives',
'In',
'Is',
'Isn',
'It',
'Ives',
'Jesus',
'Joseph',
'Just',
'KING',
'KNIGHT',
'KNIGHTS',
'Keep',
'King',
'Knight',
'Knights',
'LAUNCELOT',
'LEFT',
'LOVELY',
'LUCKY',
'Lady',
'Lake',
'Lancelot',
'Launcelot',
'Lead',
'Leaving',
'Let',
'Lie',
'Like',
'Listen',
'Loimbard',
'Look',
'Looks',
'Lord',
'Lucky',
'MAN',
'MASTER',
'MAYNARD',
'MIDDLE',
'MIDGET',
'MINSTREL',
'MONKS',
'Make',
'Man',
'May',
'Maynard',
'Meanwhile',
'Mercea',
'Message',
'Midget',
'Mind',
'Mine',
'Mmm',
'Monsieur',
'More',
'Morning',
'Most',
'Mother',
'Mud',
'Must',
'My',
'N',
'NARRATOR',
'NI',
'Nador',
'Nay',
'Neee',
'Never',
'Ni',
'Nine',
'Ninepence',
'No',
'None',
'Not',
'Nothing',
'Now',
'Nu',
'O',
'OF',
'OFFICER',
'OLD',
'OTHER',
'Of',
'Off',
'Oh',
'Ohh',
'Old',
'Olfin',
'On',
'Once',
'One',
'Ooh',
'Oooh',
'Oooo',
'Oooohoohohooo',
'Oooooooh',
'Open',
'Or',
'Order',
'Other',
'Oui',
'Our',
'Over',
'Ow',
'PARTY',
'PATSY',
'PERSON',
'PIGLET',
'PRINCE',
'PRINCESS',
'PRISONER',
'Packing',
'Patsy',
'Pendragon',
'Peng',
'Perhaps',
'Peril',
'Picture',
'Pie',
'Piglet',
'Pin',
'Please',
'Practice',
'Prepare',
'Prince',
'Princess',
'Providence',
'Psalms',
'Pull',
'Pure',
'Put',
'Quick',
'Quickly',
'Quiet',
'Quite',
'Quoi',
'RANDOM',
'RIGHT',
'ROBIN',
'ROGER',
'Rather',
'Really',
'Recently',
'Remove',
'Rheged',
'Ridden',
'Right',
'Riiight',
'Robin',
'Robinson',
'Roger',
'Round',
'Run',
'Running',
'S',
'SCENE',
'SECOND',
'SENTRY',
'SHRUBBER',
'SIR',
'SOLDIER',
'STUNNER',
'SUN',
'Said',
'Saint',
'Saxons',
'Say',
'Schools',
'See',
'Seek',
'Shall',
'She',
'Shh',
'Shrubber',
'Shrubberies',
'Shut',
'Silence',
'Silly',
'Since',
'Sir',
'Skip',
'So',
'Sorry',
'Speak',
'Splendid',
'Spring',
'Stand',
'Stay',
'Steady',
'Stop',
'Summer',
'Supposing',
'Supreme',
'Surely',
'Swamp',
'THE',
'TIM',
'Table',
'Tale',
'Tall',
'Tell',
'Thank',
'That',
'The',
'Thee',
'Then',
'There',
'Therefore',
'They',
'This',
'Those',
'Thou',
'Thpppppt',
'Thppppt',
'Thpppt',
'Thppt',
'Three',
'Throw',
'Thsss',
'Thursday',
'Thy',
'Til',
'Tim',
'Tis',
'To',
'Today',
'Together',
'Too',
'Torment',
'Tower',
'True',
'Try',
'Twenty',
'Two',
'U',
'Uh',
'Uhh',
'Ulk',
'Um',
'Umhm',
'Umm',
'Un',
'Unfortunately',
'Until',
'Use',
'Uther',
'Uugh',
'Uuh',
'VILLAGER',
'VILLAGERS',
'VOICE',
'Very',
'Victory',
'W',
'WIFE',
'WINSTON',
'WITCH',
'WOMAN',
'Waa',
'Wait',
'Walk',
'Wayy',
'We',
'Welcome',
'Well',
'What',
'When',
'Where',
'Which',
'Who',
'Whoa',
'Why',
'Will',
'Winston',
'Winter',
'With',
'Woa',
'Wood',
'Would',
'Y',
'Yapping',
'Yay',
'Yeaaah',
'Yeaah',
'Yeah',
'Yes',
'You',
'Your',
'Yup',
'ZOOT',
'Zoot',
'[',
'[...',
']',
'a',
'aaaaaah',
'aaaah',
'aaggggh',
'aaugh',
'able',
'about',
'absolutely',
'accent',
'accompanied',
'accomplished',
'act',
'acting',
'actually',
'advancing',
'adversary',
'affairs',
'afoot',
'afraid',
'after',
'again',
'against',
'agree',
'ain',
'air',
'alarm',
'alight',
'alive',
'all',
'allowed',
'almost',
'aloft',
'along',
'already',
'also',
'although',
'always',
'am',
'amazes',
'an',
'anarcho',
'anchovies',
'and',
'angels',
'anging',
'animal',
'animator',
'another',
'answer',
'answers',
'any',
'anyone',
'anything',
'anyway',
'anywhere',
'apart',
'apologise',
'appearing',
'appease',
'approacheth',
'approaching',
'aptly',
'aquatic',
'are',
'aren',
'argue',
'arm',
'armed',
'armor',
'arms',
'around',
'arrange',
'arrows',
'art',
'as',
'aside',
'ask',
'asking',
'asks',
'assault',
'assist',
'at',
'attack',
'attend',
'auntie',
'aunties',
'autocracy',
'automatically',
'autonomous',
'auuuuuuuugh',
'avenged',
'averting',
'awaaaaay',
'awaaay',
'awaits',
'away',
'awfully',
'awhile',
'b',
'baaaa',
'baby',
'back',
'bad',
'badger',
'banana',
'band',
'bang',
'bangin',
'basic',
'basis',
'bastard',
'bastards',
'bathing',
'bats',
'be',
'beacon',
'beat',
'beautiful',
'became',
'because',
'become',
'bed',
'beds',
'been',
'behaviour',
'behind',
'behold',
'being',
'bells',
'bent',
'beside',
'best',
'bet',
'better',
'between',
'beyond',
'bi',
'bicker',
'bid',
'big',
'biggest',
'binding',
'bint',
'bird',
'birds',
'biscuits',
'bit',
'bitching',
'bite',
'biters',
'bits',
'bladders',
'blanket',
'bleed',
'bleeder',
'bless',
'blessing',
'blondes',
'blood',
'bloody',
'blow',
'body',
'boil',
'boing',
'bois',
'bold',
'bond',
'bones',
'bonk',
'boom',
'bosom',
'bother',
'bottom',
'bottoms',
'bowels',
'bows',
'boys',
'brain',
'brained',
'brave',
'bravely',
'bravest',
'breadth',
'breakfast',
'breath',
'bride',
'bridge',
'bridgekeeper',
'bridges',
'bring',
'bringing',
'broken',
'brought',
'brunettes',
'brush',
'bugger',
'buggered',
'buggering',
'build',
'built',
'bum',
'bunny',
'burn',
'burned',
'burst',
'business',
'busy',
'but',
'buy',
'by',
'c',
'cadeau',
'call',
'called',
'can',
'cannot',
'capital',
'carp',
'carried',
'carries',
'carry',
'carrying',
'cart',
'cartoon',
'carve',
'carved',
'carving',
'case',
'cast',
'castanets',
'castle',
'cause',
'cave',
'centuries',
'cereals',
'ceremony',
'certain',
'certainly',
'chance',
'change',
'changed',
'chanting',
'charged',
'chastity',
'cheesy',
'chest',
'chickened',
'chickening',
'chops',
'chord',
'chorus',
'chosen',
'chu',
'clack',
'clad',
'clang',
'clank',
'clap',
'class',
'classes',
'clear',
'clever',
'climes',
'clllank',
'clop',
'closest',
'clue',
'clunk',
'coconut',
'coconuts',
'collective',
'color',
'come',
'comin',
'coming',
'command',
'commands',
'committed',
'commune',
'compared',
'completely',
'conclusion',
'conclusions',
'confuse',
'considerable',
'consulted',
'continue',
'convinced',
'cop',
'cope',
'cost',
'cough',
'could',
'couldn',
'count',
'counting',
'country',
'couple',
'courage',
'course',
'court',
'cover',
'covered',
'crash',
'creak',
'creature',
'creep',
'creeper',
'crone',
'cross',
'crossed',
'cruel',
'cry',
'crying',
'curtains',
'cut',
'd',
'dad',
'daft',
'dance',
'dancing',
'danger',
'dangerous',
'dappy',
'dare',
'daring',
'dark',
'daughter',
'day',
'de',
'dead',
'deal',
'dear',
'death',
'decided',
'decision',
'deeds',
'defeat',
'defeator',
'delirious',
'demand',
'depart',
'depressing',
'derives',
'design',
'diaphragm',
'dictating',
'dictatorship',
'did',
'didn',
'die',
'died',
'differences',
'dine',
'direction',
'dirty',
'discovered',
'discovers',
'disheartened',
'distress',
'distributing',
'do',
'doctors',
'does',
'doesn',
'dogma',
'dogs',
'doing',
'domine',
'don',
'dona',
'donaeis',
'done',
'donkey',
'door',
'doors',
'dorsal',
'doubt',
'down',
'dragging',
'dramatic',
'draw',
...]
In [39]:
# Ratio of distinct tokens to total tokens — a simple measure of lexical diversity.
len(set(text6)) / len(text6)
Out[39]:
0.1276595744680851
In [45]:
# Number of times the exact token "Allo" occurs in the text.
text6.count("Allo")
Out[45]:
2
In [47]:
# NOTE(review): `sentence` is not referenced by any other cell visible in this notebook.
sentence = ['Melby', 'is', 'the', 'best']
# Slice 20 consecutive tokens out of the chat corpus (indices 16715 through 16734).
text5[16715:16735]
Out[47]:
['U86',
'thats',
'why',
'something',
'like',
'gamefly',
'is',
'so',
'good',
'because',
'you',
'can',
'actually',
'play',
'a',
'full',
'game',
'without',
'buying',
'it']
In [50]:
# Count how many times each distinct token occurs in the text.
FreqDist(text6)
Out[50]:
FreqDist({'SCENE': 24,
'1': 76,
':': 1197,
'[': 319,
'wind': 3,
']': 312,
'clop': 39,
'KING': 1,
'ARTHUR': 225,
'Whoa': 1,
'there': 25,
'!': 801,
'SOLDIER': 24,
'#': 127,
'Halt': 3,
'Who': 25,
'goes': 1,
'?': 207,
'It': 35,
'is': 106,
'I': 255,
',': 731,
'Arthur': 36,
'son': 5,
'of': 158,
'Uther': 1,
'Pendragon': 1,
'from': 20,
'the': 299,
'castle': 18,
'Camelot': 26,
'.': 816,
'King': 27,
'Britons': 11,
'defeator': 1,
'Saxons': 1,
'sovereign': 1,
'all': 30,
'England': 2,
'Pull': 2,
'other': 5,
'one': 32,
'am': 22,
'...': 118,
'and': 135,
'this': 59,
'my': 38,
'trusty': 1,
'servant': 2,
'Patsy': 4,
'We': 60,
'have': 53,
'ridden': 2,
'length': 1,
'breadth': 1,
'land': 11,
'in': 86,
'search': 3,
'knights': 10,
'who': 21,
'will': 22,
'join': 7,
'me': 46,
'court': 3,
'at': 27,
'must': 23,
'speak': 2,
'with': 38,
'your': 75,
'lord': 3,
'master': 5,
'What': 65,
'Ridden': 1,
'on': 47,
'a': 188,
'horse': 1,
'Yes': 42,
'You': 61,
"'": 421,
're': 41,
'using': 1,
'coconuts': 3,
've': 21,
'got': 27,
'two': 8,
'empty': 2,
'halves': 1,
'coconut': 6,
'you': 204,
'bangin': 1,
'em': 3,
'together': 2,
'So': 14,
'since': 3,
'snows': 1,
'winter': 2,
'covered': 3,
'through': 9,
'kingdom': 3,
'Mercea': 2,
'--': 148,
'Where': 8,
'd': 12,
'get': 20,
'found': 8,
'them': 13,
'Found': 1,
'In': 11,
'The': 32,
's': 141,
'tropical': 1,
'do': 34,
'mean': 11,
'Well': 54,
'temperate': 1,
'zone': 1,
'swallow': 10,
'may': 9,
'fly': 1,
'south': 1,
'sun': 1,
'or': 9,
'house': 1,
'martin': 1,
'plover': 1,
'seek': 10,
'warmer': 1,
'climes': 1,
'yet': 3,
'these': 9,
'are': 52,
'not': 70,
'strangers': 1,
'to': 144,
'our': 11,
'Are': 6,
'suggesting': 1,
'migrate': 1,
'Not': 12,
'They': 8,
'could': 17,
'be': 43,
'carried': 5,
'A': 50,
'carrying': 1,
'grip': 1,
'it': 107,
'by': 14,
'husk': 1,
'question': 3,
'where': 6,
'he': 43,
'grips': 1,
'simple': 5,
'weight': 1,
'ratios': 1,
'five': 6,
'ounce': 1,
'bird': 1,
'carry': 3,
'pound': 7,
'doesn': 9,
't': 77,
'matter': 1,
'Will': 3,
'go': 22,
'tell': 24,
'that': 84,
'Court': 2,
'here': 34,
'Listen': 4,
'order': 2,
'maintain': 1,
'air': 2,
'-': 88,
'speed': 2,
'velocity': 2,
'needs': 1,
'beat': 4,
'its': 3,
'wings': 1,
'forty': 1,
'three': 8,
'times': 4,
'every': 3,
'second': 6,
'right': 21,
'Please': 10,
'Am': 1,
'm': 34,
'interested': 1,
'2': 42,
'an': 13,
'African': 4,
'Oh': 110,
'yeah': 7,
'maybe': 1,
'but': 32,
'European': 2,
'That': 22,
'point': 2,
'agree': 2,
'ask': 2,
'if': 24,
'wants': 1,
'?!': 12,
'But': 15,
'then': 15,
'course': 6,
'swallows': 5,
'non': 1,
'migratory': 1,
'they': 18,
'couldn': 1,
'bring': 1,
'back': 10,
'anyway': 4,
'Wait': 2,
'minute': 2,
'Supposing': 1,
'No': 76,
'line': 2,
'just': 25,
'use': 2,
'strand': 1,
'creeper': 1,
'held': 2,
'under': 2,
'dorsal': 1,
'guiding': 1,
'feathers': 1,
'why': 3,
'thud': 8,
'clang': 17,
'CART': 13,
'MASTER': 13,
'Bring': 14,
'out': 33,
'dead': 22,
'cough': 6,
'...]': 3,
'[...': 1,
'Ninepence': 2,
'rewr': 10,
'!]': 5,
'CUSTOMER': 13,
'Here': 4,
'DEAD': 8,
'PERSON': 8,
'Nothing': 2,
'ninepence': 1,
'Ere': 1,
'He': 46,
'says': 3,
'isn': 6,
'soon': 3,
'very': 17,
'ill': 1,
'getting': 3,
'better': 8,
'll': 19,
'stone': 2,
'moment': 1,
'can': 32,
'take': 11,
'him': 34,
'like': 15,
'against': 2,
'regulations': 1,
'don': 26,
'want': 13,
'cart': 1,
'such': 2,
'baby': 1,
'feel': 6,
'fine': 3,
'us': 32,
'favor': 2,
'hang': 1,
'around': 1,
'couple': 1,
'minutes': 2,
'won': 2,
'long': 3,
'Robinson': 1,
'lost': 3,
'nine': 1,
'today': 4,
'when': 12,
'next': 2,
'round': 3,
'Thursday': 2,
'think': 19,
'for': 33,
'walk': 1,
'fooling': 2,
'anyone': 3,
'know': 17,
'Look': 24,
'something': 3,
'singing': 31,
'happy': 3,
'whop': 4,
'Ah': 7,
'thanks': 1,
'much': 11,
'See': 2,
'Right': 27,
'All': 12,
'howl': 5,
'dunno': 1,
'Must': 1,
'king': 10,
'Why': 4,
'hasn': 1,
'shit': 2,
'over': 4,
'3': 12,
'music': 29,
'stops': 14,
'Old': 3,
'woman': 3,
'DENNIS': 21,
'Man': 3,
'Sorry': 7,
'knight': 5,
'live': 6,
'thirty': 2,
'seven': 2,
'what': 27,
'old': 7,
'call': 5,
"'.": 19,
'say': 14,
'Dennis': 4,
'didn': 11,
'were': 16,
'called': 2,
'bother': 2,
'find': 11,
'did': 10,
'sorry': 5,
'about': 13,
"',": 4,
'behind': 2,
'looked': 1,
'object': 1,
'automatically': 1,
'treat': 2,
'inferior': 1,
'eh': 7,
'nice': 8,
'And': 50,
'how': 6,
'By': 4,
'exploiting': 1,
'workers': 1,
'anging': 1,
'outdated': 1,
'imperialist': 1,
'dogma': 1,
'which': 10,
'perpetuates': 1,
'economic': 2,
'social': 1,
'differences': 1,
'society': 1,
'If': 9,
'ever': 3,
'going': 22,
'any': 6,
'progress': 1,
'WOMAN': 11,
'some': 8,
'lovely': 2,
'filth': 1,
'down': 7,
'How': 10,
'good': 11,
'lady': 2,
'we': 62,
'had': 13,
'thought': 4,
'autonomous': 1,
'collective': 1,
'yourself': 2,
'living': 3,
'dictatorship': 1,
'self': 1,
'perpetuating': 1,
'autocracy': 1,
'working': 1,
'classes': 1,
'bringing': 1,
'class': 1,
'into': 15,
'again': 10,
'only': 9,
'people': 6,
'would': 6,
'hear': 7,
'please': 8,
'haste': 1,
'lives': 2,
'Then': 8,
'told': 2,
'anarcho': 1,
'syndicalist': 1,
'commune': 1,
'turns': 1,
'act': 1,
'as': 15,
'sort': 3,
'executive': 3,
'officer': 2,
'week': 1,
'decision': 1,
'ratified': 1,
'special': 3,
'bi': 1,
'weekly': 1,
'meeting': 1,
'see': 17,
'majority': 2,
'case': 3,
'purely': 1,
'internal': 1,
'affairs': 1,
',--': 2,
'Be': 3,
'quiet': 5,
'thirds': 1,
'more': 13,
'major': 1,
'Order': 1,
'does': 8,
'Heh': 10,
'vote': 2,
'kings': 2,
'become': 2,
'Lady': 1,
'Lake': 1,
'angels': 6,
'sing': 7,
'her': 40,
'arm': 4,
'clad': 1,
'purest': 1,
'shimmering': 1,
'samite': 1,
'aloft': 1,
'Excalibur': 2,
'bosom': 1,
'water': 4,
'signifying': 1,
'Divine': 1,
'Providence': 1,
'was': 24,
'strange': 2,
'women': 1,
'lying': 1,
'ponds': 1,
'distributing': 1,
'swords': 1,
'no': 55,
'basis': 1,
'system': 3,
'government': 1,
'Supreme': 1,
'power': 2,
'derives': 1,
'mandate': 1,
'masses': 1,
'farcical': 1,
'aquatic': 1,
'ceremony': 1,
'expect': 1,
'wield': 1,
'supreme': 1,
'cause': 2,
'watery': 1,
'tart': 1,
'threw': 1,
'sword': 3,
'Shut': 12,
'up': 36,
'went': 3,
'saying': 6,
'emperor': 1,
'because': 2,
'moistened': 1,
'bint': 1,
'lobbed': 1,
'scimitar': 1,
'put': 2,
'away': 38,
'now': 12,
'violence': 2,
'inherent': 2,
'Come': 26,
'Help': 1,
'help': 5,
'being': 3,
'repressed': 1,
'Bloody': 2,
'peasant': 1,
'give': 5,
'Did': 6,
'repressing': 1,
'saw': 17,
'4': 2,
'BLACK': 32,
'KNIGHT': 68,
'Aaagh': 4,
'GREEN': 5,
'Ooh': 5,
'stab': 1,
'Aagh': 2,
'Agh': 6,
'!,': 8,
'oh': 2,
'etc': 4,
'Aaaaaah': 1,
'Aaaaaaaaah': 1,
'woosh': 3,
'kills': 2,
'scrape': 1,
'Umm': 1,
'fight': 4,
'strength': 2,
'many': 4,
'men': 3,
'Sir': 52,
'Knight': 11,
'pause': 6,
'finest': 1,
'bravest': 1,
'proved': 1,
'worthy': 3,
'make': 12,
'sad': 2,
'None': 2,
'shall': 21,
'pass': 4,
'quarrel': 1,
'cross': 4,
'bridge': 2,
'die': 2,
'command': 2,
'stand': 3,
'aside': 2,
'move': 1,
'man': 12,
'Aaah': 3,
'hiyaah': 1,
'chops': 4,
'left': 2,
'off': 18,
'Now': 9,
'adversary': 1,
'Tis': 2,
'scratch': 2,
'Your': 1,
'worse': 1,
'liar': 1,
'pansy': 1,
'Huyah': 1,
'Hiyaah': 1,
'Aaaaaaaah': 1,
'Victory': 1,
'mine': 2,
'kneeling': 1,
'thank': 5,
'Thee': 2,
'Lord': 9,
'Thy': 1,
'mer': 1,
'Hah': 2,
'clunk': 5,
'Have': 3,
'kick': 4,
'Eh': 2,
'indeed': 1,
'brave': 17,
'enough': 4,
'stupid': 1,
'bastard': 1,
'arms': 1,
'Just': 10,
'flesh': 1,
'wound': 1,
'stop': 5,
'Chicken': 2,
'Chickennn': 1,
'leg': 3,
'bleed': 1,
'invincible': 1,
'looney': 1,
'Black': 4,
'always': 3,
'triumphs': 1,
'last': 7,
'draw': 1,
'Running': 1,
'yellow': 1,
'bastards': 1,
'coming': 2,
'bite': 1,
'legs': 1,
'5': 1,
'MONKS': 6,
'chanting': 6,
'Pie': 10,
'Iesu': 10,
'domine': 10,
'dona': 8,
'eis': 7,
'requiem': 8,
'bonk': 5,
'CROWD': 17,
'witch': 37,
'Burn': 32,
'VILLAGER': 47,
'May': 2,
'burn': 4,
'BEDEVERE': 61,
'she': 11,
'She': 7,
'looks': 2,
'Yeah': 8,
'forward': 2,
'WITCH': 4,
'Uh': 16,
'dressed': 2,
'Augh': 2,
'nose': 5,
'false': 1,
'hat': 1,
'Yeaaah': 1,
'Yeaah': 1,
'dress': 1,
'VILLAGERS': 2,
'bit': 10,
'has': 11,
'wart': 1,
'RANDOM': 8,
'makes': 1,
'turned': 3,
'newt': 2,
'Quiet': 4,
'There': 15,
'ways': 4,
'telling': 1,
'whether': 2,
'Tell': 3,
'witches': 4,
'apart': 3,
'More': 1,
'Shh': 15,
'Wood': 1,
'B': 4,
'--...': 1,
'made': 4,
'wood': 5,
'Good': 5,
'heh': 12,
'Build': 1,
'also': 2,
'bridges': 1,
'True': 1,
'Uhh': 2,
'Does': 1,
'sink': 1,
'floats': 3,
'Throw': 2,
'pond': 3,
'Bread': 1,
'Apples': 1,
'small': 2,
'rocks': 1,
'Cider': 1,
'gra': 1,
'gravy': 1,
'Cherries': 1,
'Mud': 1,
'Churches': 2,
'Lead': 2,
'duck': 5,
'Oooh': 7,
'Exactly': 1,
'logically': 1,
'weighs': 1,
'same': 4,
'therefore': 1,
'Use': 1,
'quack': 3,
'largest': 1,
'scales': 1,
'Ohh': 10,
'Ahh': 5,
'Remove': 1,
'supports': 1,
'creak': 2,
'fair': 3,
'cop': 1,
'so': 20,
'wise': 2,
'science': 1,
'My': 8,
'liege': 11,
'come': 18,
'Round': 8,
'Table': 8,
'honored': 1,
'name': 17,
'Bedevere': 6,
'dub': 1,
'NARRATOR': 17,
'first': 3,
'illustrious': 1,
'names': 2,
'follow': 1,
'Lancelot': 2,
'Brave': 5,
';': 4,
'Gallahad': 1,
'Pure': 1,
'Robin': 19,
'quite': 7,
'nearly': 2,
'fought': 2,
'Dragon': 1,
'Angnor': 1,
'stood': 1,
'vicious': 2,
'Bristol': 1,
'personally': 1,
'wet': 1,
'himself': 1,
'Battle': 1,
'Badon': 1,
'Hill': 1,
'aptly': 1,
'named': 1,
'appearing': 1,
'film': 1,
'Together': 1,
'formed': 1,
'band': 1,
'whose': 2,
'deeds': 1,
'retold': 1,
'throughout': 1,
'centuries': 1,
'Knights': 18,
'6': 1,
'SIR': 4,
'earth': 1,
'banana': 1,
'shaped': 3,
'This': 8,
'new': 3,
'learning': 1,
'amazes': 1,
'Explain': 1,
'sheep': 1,
'bladders': 1,
'employed': 1,
'prevent': 1,
'earthquakes': 1,
'certainly': 1,
'sir': 16,
'LAUNCELOT': 76,
'trumpets': 4,
'GALAHAD': 69,
'PATSY': 1,
'model': 1,
'bid': 1,
'welcome': 1,
'home': 2,
'Let': 9,
'ride': 1,
'medieval': 2,
'hall': 3,
'KNIGHTS': 37,
'table': 1,
'dance': 1,
'e': 5,
'er': 2,
'able': 2,
'routines': 1,
'chorus': 1,
'scenes': 2,
'With': 2,
'footwork': 1,
'impeccable': 1,
'dine': 1,
'well': 8,
'eat': 2,
'ham': 1,
'jam': 1,
'spam': 1,
'lot': 7,
'dancing': 2,
'Our': 3,
'shows': 1,
'formidable': 1,
'given': 2,
'rhymes': 1,
'unsingable': 1,
'opera': 1,
'mad': 1,
'diaphragm': 1,
'dungeon': 1,
'PRISONER': 1,
'clap': 15,
'tap': 1,
'war': 1,
'tough': 1,
'Quite': 2,
'indefatigable': 1,
'Between': 1,
'quests': 1,
'sequin': 1,
'vests': 1,
'impersonate': 1,
'Clark': 1,
'Gable': 1,
'busy': 2,
'life': 4,
'MAN': 9,
'push': 2,
'pram': 1,
'outdoors': 1,
'let': 7,
'silly': 7,
'place': 5,
'7': 1,
'boom': 45,
'GOD': 6,
'grovel': 1,
'One': 5,
'thing': 4,
'groveling': 1,
'apologise': 1,
'Every': 1,
'time': 11,
'try': 2,
'talk': 3,
'someone': 4,
'forgive': 1,
'doing': 4,
'averting': 1,
'eyes': 3,
'O': 11,
'those': 4,
'miserable': 1,
'Psalms': 1,
'depressing': 1,
'knock': 1,
'task': 2,
'example': 1,
'dark': 2,
'idea': 4,
'Course': 2,
'Behold': 2,
'Holy': 24,
'Grail': 34,
'sacred': 7,
'grail': 5,
'purpose': 1,
'quest': 13,
'blessing': 2,
'God': 11,
'praised': 2,
'8': 1,
'horn': 1,
'Hallo': 2,
'FRENCH': 29,
'GUARD': 53,
'Allo': 2,
'eet': 1,
'Guy': 1,
'de': 2,
'Loimbard': 1,
'Go': 7,
'been': 7,
'charged': 1,
'food': 2,
'shelter': 1,
'night': 1,
'keen': 1,
'already': 4,
'sure': 12,
'yes': 11,
'(': 3,
'.)': 2,
'u': 2,
'um': 10,
'look': 7,
'Of': 3,
'English': 7,
'types': 2,
'French': 6,
'outrageous': 1,
'accent': 1,
'Mind': 1,
'own': 4,
'business': 1,
'show': 3,
'force': 2,
'frighten': 1,
'pig': 2,
'dogs': 1,
'boil': 1,
'bottom': 3,
'sons': 2,
'person': 4,
'blow': 2,
'k': 3,
'nnnnniggets': 1,
'Thpppppt': 1,
'Thppt': 2,
'wanna': 1,
'headed': 1,
'animal': 1,
'trough': 1,
'wiper': 1,
'fart': 1,
'general': 1,
'direction': 2,
'mother': 2,
'hamster': 1,
'father': 7,
'smelt': 1,
'elderberries': 1,
'Is': 5,
'else': 5,
'taunt': 2,
'sniff': 3,
'chance': 4,
'than': 4,
'reasonable': 1,
'Fetchez': 2,
'la': 2,
'vache': 2,
'OTHER': 1,
'Quoi': 2,
'!)': 1,
'mooo': 1,
'commands': 1,
'twong': 4,
'mooooooo': 1,
'Jesus': 3,
'Christ': 5,
'Charge': 4,
'mayhem': 2,
'Hey': 4,
'dad': 2,
'Run': 21,
'Thppppt': 1,
'GUARDS': 8,
'taunting': 5,
'Fiends': 1,
'tear': 1,
'plan': 1,
'later': 1,
'bang': 1,
'squeak': 29,
'rrrr': 3,
'drilllll': 1,
'sawwwww': 1,
'crash': 4,
'whispering': 2,
'C': 1,
'est': 1,
'un': 2,
'lapin': 2,
'bois': 1,
'Un': 1,
'cadeau': 2,
'present': 1,
'Oui': 1,
'oui': 1,
'Hurry': 5,
'On': 2,
'y': 3,
'va': 1,
'Bon': 1,
'magne': 1,
'Over': 2,
'clllank': 1,
'happens': 1,
'uh': 26,
'Launcelot': 25,
'Galahad': 11,
'wait': 1,
'until': 3,
'nightfall': 1,
'leap': 3,
...})
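In set notation, the words of interest are "the set of all w such that w is an element of V (the vocabulary) and w has property P":
{w | w ∈ V & P(w)}
The equivalent Python list comprehension, with the property P written as a predicate p (in the next cell, `len(w) > 16`), is:
[w for w in V if p(w)]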
In [55]:
# Vocabulary of text1: one entry per distinct token.
V = set(text1)
# Keep only the vocabulary items that are longer than 16 characters.
long_words = [token for token in V if 16 < len(token)]
# Display them in sorted order.
sorted(long_words)
Out[55]:
['cannibalistically',
'characteristically',
'circumnavigations',
'comprehensiveness',
'indispensableness',
'preternaturalness',
'subterraneousness',
'superstitiousness',
'uncomfortableness',
'uncompromisedness',
'uninterpenetratingly']
To get a handle on collocations, we start off by extracting from a text a list of word pairs, also known as bigrams. This is easily accomplished with the function `bigrams()`:
In [60]:
# Disabled bigram example: list(bigrams(['more', 'is', 'said', 'than', 'done']))
# Print the collocations that nltk finds in the text.
text6.collocations()
BLACK KNIGHT; clop clop; HEAD KNIGHT; mumble mumble; Holy Grail;
squeak squeak; FRENCH GUARD; saw saw; Sir Robin; Run away; CARTOON
CHARACTER; King Arthur; Iesu domine; Pie Iesu; DEAD PERSON; Round
Table; clap clap; OLD MAN; dramatic chord; dona eis
In [63]:
# Frequency distribution over token lengths (in characters) rather than over the tokens themselves.
fdist = FreqDist(map(len, text6))
# Show the (length, count) pairs ordered from most to least frequent.
fdist.most_common()
Out[63]:
[(1, 5982),
(3, 2375),
(4, 2298),
(2, 2176),
(5, 1450),
(6, 1207),
(7, 713),
(8, 395),
(9, 254),
(10, 49),
(11, 31),
(12, 31),
(13, 6)]
In [64]:
# Plot the cumulative frequency of the token lengths counted in `fdist`.
fdist.plot(cumulative=True)
In [65]:
# Length in characters of every token, in text order.
# NOTE(review): this displays one entry per token of the text; consider slicing before display.
list(map(len, text6))
Out[65]:
[5,
1,
1,
1,
4,
1,
1,
4,
4,
4,
1,
4,
6,
1,
4,
5,
1,
1,
4,
4,
4,
1,
7,
1,
1,
1,
4,
1,
3,
4,
5,
1,
6,
1,
2,
2,
1,
1,
6,
1,
3,
2,
5,
9,
1,
4,
3,
6,
2,
7,
1,
4,
2,
3,
7,
1,
8,
2,
3,
6,
1,
9,
2,
3,
7,
1,
7,
1,
1,
1,
4,
3,
5,
3,
1,
6,
1,
1,
2,
1,
3,
3,
4,
2,
2,
6,
7,
5,
1,
2,
4,
6,
3,
6,
3,
7,
2,
3,
4,
2,
6,
2,
7,
3,
4,
4,
2,
2,
2,
5,
2,
7,
1,
1,
4,
5,
4,
4,
4,
3,
6,
1,
7,
1,
1,
1,
4,
1,
6,
2,
1,
5,
1,
6,
1,
3,
1,
7,
1,
1,
1,
3,
1,
2,
5,
8,
1,
6,
1,
4,
1,
7,
1,
1,
1,
3,
1,
2,
3,
3,
5,
6,
2,
7,
3,
3,
1,
2,
6,
1,
1,
2,
8,
1,
6,
1,
2,
1,
2,
4,
6,
5,
3,
5,
2,
6,
7,
4,
4,
1,
7,
3,
7,
2,
6,
1,
7,
2,
7,
1,
1,
1,
5,
1,
1,
3,
3,
3,
8,
1,
6,
1,
2,
5,
4,
1,
7,
1,
1,
1,
5,
4,
1,
2,
6,
1,
3,
7,
1,
1,
8,
1,
6,
1,
4,
2,
3,
4,
1,
7,
1,
1,
1,
4,
1,
4,
2,
1,
9,
4,
1,
6,
1,
3,
7,
3,
3,
5,
4,
3,
3,
2,
3,
5,
6,
2,
3,
6,
3,
4,
6,
6,
2,
6,
1,
3,
5,
3,
3,
9,
2,
3,
4,
1,
7,
1,
1,
1,
3,
3,
10,
8,
7,
1,
6,
1,
3,
2,
3,
1,
4,
5,
2,
7,
1,
7,
1,
1,
1,
4,
1,
1,
7,
8,
1,
7,
1,
6,
1,
2,
5,
4,
2,
2,
3,
4,
1,
7,
1,
1,
1,
2,
1,
1,
3,
1,
8,
2,
5,
2,
5,
2,
1,
2,
1,
1,
1,
6,
8,
2,
6,
6,
1,
1,
4,
5,
4,
5,
3,
5,
1,
3,
5,
7,
1,
6,
1,
4,
1,
2,
5,
1,
1,
6,
1,
4,
3,
2,
3,
4,
4,
6,
4,
6,
4,
3,
5,
2,
7,
2,
4,
1,
7,
1,
1,
1,
6,
1,
2,
5,
2,
8,
3,
1,
5,
8,
1,
1,
7,
5,
2,
4,
3,
5,
5,
1,
5,
5,
5,
6,
1,
5,
1,
6,
1,
6,
1,
7,
1,
1,
1,
2,
1,
5,
1,
6,
1,
1,
1,
1,
3,
10,
1,
7,
1,
1,
1,
2,
5,
2,
7,
2,
2,
7,
7,
1,
7,
1,
1,
1,
2,
1,
4,
1,
2,
7,
7,
5,
1,
3,
3,
1,
8,
7,
1,
4,
1,
1,
2,
5,
1,
7,
1,
1,
1,
2,
1,
4,
1,
1,
5,
4,
4,
1,
6,
1,
4,
3,
3,
4,
6,
2,
2,
5,
2,
4,
2,
5,
2,
7,
2,
7,
1,
1,
1,
3,
4,
2,
6,
1,
2,
7,
8,
3,
3,
1,
9,
1,
7,
1,
1,
1,
2,
1,
4,
3,
7,
1,
1,
1,
2,
4,
6,
1,
1,
5,
1,
7,
4,
6,
3,
1,
4,
4,
4,
1,
7,
1,
1,
1,
4,
1,
6,
1,
9,
3,
8,
7,
2,
8,
1,
7,
1,
1,
1,
2,
1,
4,
1,
1,
4,
2,
4,
2,
2,
1,
4,
1,
7,
1,
1,
1,
4,
1,
6,
1,
4,
1,
1,
4,
3,
1,
6,
2,
7,
1,
7,
1,
1,
1,
4,
1,
4,
5,
3,
6,
7,
8,
1,
7,
1,
1,
1,
4,
1,
3,
3,
1,
5,
1,
1,
1,
4,
1,
1,
5,
1,
4,
1,
6,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
5,
4,
1,
5,
1,
4,
5,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
9,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
4,
4,
1,
1,
5,
1,
5,
3,
3,
1,
4,
2,
3,
4,
4,
1,
1,
4,
2,
1,
5,
1,
5,
3,
4,
4,
1,
8,
1,
4,
1,
1,
3,
1,
4,
1,
6,
1,
9,
1,
4,
6,
1,
1,
1,
1,
3,
4,
1,
4,
1,
6,
1,
4,
1,
8,
1,
7,
1,
4,
1,
1,
4,
9,
1,
4,
6,
1,
1,
1,
1,
3,
4,
1,
4,
1,
6,
1,
1,
3,
1,
2,
4,
2,
1,
1,
3,
4,
1,
8,
1,
3,
2,
2,
1,
4,
6,
1,
1,
1,
1,
3,
1,
4,
1,
6,
1,
2,
3,
1,
1,
1,
8,
1,
4,
1,
2,
4,
2,
4,
1,
2,
1,
1,
4,
3,
1,
4,
6,
1,
1,
1,
1,
7,
6,
1,
8,
1,
2,
3,
1,
2,
3,
1,
3,
1,
2,
2,
5,
4,
2,
1,
6,
1,
4,
1,
6,
1,
2,
1,
1,
3,
1,
1,
4,
3,
4,
4,
1,
2,
1,
1,
7,
11,
1,
4,
6,
1,
1,
3,
1,
1,
4,
2,
2,
2,
3,
4,
1,
8,
1,
2,
1,
3,
1,
1,
2,
4,
1,
4,
1,
4,
1,
6,
1,
1,
3,
1,
1,
4,
3,
1,
4,
6,
1,
1,
4,
4,
1,
8,
1,
4,
1,
2,
2,
1,
5,
1,
4,
1,
6,
1,
1,
3,
1,
1,
1,
8,
1,
4,
1,
3,
3,
4,
6,
1,
6,
2,
7,
1,
2,
3,
1,
1,
2,
4,
1,
4,
1,
6,
1,
2,
1,
1,
1,
2,
3,
2,
2,
2,
3,
8,
1,
1,
1,
...]
In [ ]:
Content source: Sebbenbear/notebooks
Similar notebooks: